1. Create the workspace

knitr::opts_chunk$set(echo = TRUE)
#define working directory
knitr::opts_knit$set(root.dir = '/Users/jasoncosgrove/Dropbox (Team_Perie)/Jason/Experiments/Dry_Lab/MetaFate_final_pipeline/data/Step3_DNA_BC_processing/all_samples/freqthresh_1/')

rm(list = ls())

#define path
path = "/Users/jasoncosgrove/Dropbox (Team_Perie)/Jason/Experiments/Dry_Lab/MetaFate_final_pipeline/data/Step3_DNA_BC_processing/all_samples/freqthresh_1/"

setwd(path)

library(ggplot2)
library(plyr)
library(reshape2)
library(stringr)

2. Define the helper methods we will use in the analysis

3. Merge all of the files into a single dataframe

Load the demultiplexed reads for each sample and combine them into a single data frame. In this format the columns are samples, and the rows are barcodes.

mergedData <- mergeSamples(path)

#get rid of mice where we dont have technical replicates or both M and E samples
mergedData <- mergedData[,!grepl("534",colnames(mergedData))]
mergedData <- mergedData[,!grepl("M454_M_",colnames(mergedData))]

4. normalise data

For each sample normalise the reads by dividing each barcode by the sample total. With this transformation all reads in a given sample sum to 1

norm.data <- normaliseData(mergedData)

5. assess read thresholds

some barcodes are not very common and may be the result of PCR or sequencing errors and do not represent true barcodes. We look at the cumulative read distribution to see what could be a sensible threshold to filter the barcodes on.

We filter samples based on the read abundance determined from the cumulative distribution of read counts

#fraction threshold is 0.003 based on MEF data
threshold=0.003

#plot the cumulative read distribution to see if our read threshold is sensible
for(i in 1:ncol(norm.data)){
  P = ecdf(norm.data[,i])
  plot(P)
  abline(v=threshold, col="blue")
}

6. Merge replicates

For each sample we have a and b replicates, create a new dataframe where each value represents the summed a and b replicates

#make a file with sum of duplicates
#load data with duplicates
d <- norm.data

#add duplicates
mergedReplicates <- as.data.frame(matrix(0, ncol = (dim(d)[2]-1)/2, nrow = dim(d)[1]))
j=0
for (i in seq(1,dim(d)[2],2)) {
  j=j+1
  mergedReplicates[,j] <- (d[,i]+d[,i+1])
}

names(mergedReplicates) <- c("JCW26_M1_M","JCW26_M1_E",
                             "JCW26_M2_M","JCW26_M2_E",
                             "JCW26_M3_M","JCW26_M3_E",
                             "JCW26_M4_M","JCW26_M4_E",
                             "ET27a_M1_M","ET27a_M1_E",
                             "ET27b_M1_M","ET27b_M1_E",
                             "ET21_M454_M")

rownames(mergedReplicates) <- rownames(d)

7. renorm to cell numbers

We want to convert the data from a reads scale to a cell scale so we normalise based on cell numbers that we get from our cell sorting metadata. We do this for each replicate, and so perform the normalisation on the unmerged data

cell.numbers <- read.csv("/Users/jasoncosgrove/Dropbox (Team_Perie)/Jason/Experiments/Dry_Lab/MetaFate_final_pipeline/data/Step3_DNA_BC_processing/cell_counts.csv")

data.cell.norm <- cellNumberNormalisation(norm.data,cell.numbers,merged = FALSE)
mergedReplicates.cell.norm <- cellNumberNormalisation(mergedReplicates,cell.numbers,merged = TRUE)

8. convert to long

Convert the data to long format to facilitate plotting

dab <- convertToLongFormat(norm.data)
dab.cellnorm <- convertToLongFormat(data.cell.norm)

9. Write unfiltered data to file

#do the same but without the cell reconversion
final.bcs <- unique(dab$tag)
final.bc.matrix <- mergedReplicates[final.bcs,]
final.bc.matrix.cellnorm <- mergedReplicates.cell.norm[final.bcs,]
colSums(final.bc.matrix > 0)
##  JCW26_M1_M  JCW26_M1_E  JCW26_M2_M  JCW26_M2_E  JCW26_M3_M  JCW26_M3_E 
##          18          15          49          61          69          54 
##  JCW26_M4_M  JCW26_M4_E  ET27a_M1_M  ET27a_M1_E  ET27b_M1_M  ET27b_M1_E 
##          35          57         300         296         174         464 
## ET21_M454_M 
##         157
write.csv(final.bc.matrix, "DRAG_DNA_barcodes_normalised_no_ab_filtering.csv")
write.csv(final.bc.matrix.cellnorm, "DRAG_DNA_barcodes_normalised_no_ab_filtering_cellnorm.csv")

#plot new self/self before abfiltering
qplot(asinh(vara), asinh(varb), data=dab) + 
  facet_wrap(~paste(mouse,expt, celltype, sep = " _ "))

#plot new self/self before abfiltering with cell normalisation
qplot(asinh(vara), asinh(varb), data=dab.cellnorm) + 
  facet_wrap(~paste(mouse,expt, celltype, sep = " _ "))

10. filtering based only on technical replicates

remove barcodes which only occur in one of the two technical replicates

dab.filtered <- dab[which(dab$vara>0 & dab$varb>0),]
dab.cellnorm.filtered <- dab.cellnorm[which(dab$vara>0 & dab$varb>0),]
final.bcs <- unique(dab.filtered$tag)

final.bc.matrix <- mergedReplicates[final.bcs,]
final.bc.matrix.cellnorm <- mergedReplicates.cell.norm[final.bcs,]
colSums(final.bc.matrix > 0)
##  JCW26_M1_M  JCW26_M1_E  JCW26_M2_M  JCW26_M2_E  JCW26_M3_M  JCW26_M3_E 
##          13          12          34          42          46          35 
##  JCW26_M4_M  JCW26_M4_E  ET27a_M1_M  ET27a_M1_E  ET27b_M1_M  ET27b_M1_E 
##          28          30          97         122          75          81 
## ET21_M454_M 
##          82
write.csv(final.bc.matrix.cellnorm, "DRAG_DNA_barcodes_normalised_and_ab_filtered_cellnorm.csv")
write.csv(final.bc.matrix, "DRAG_DNA_barcodes_normalised_and_ab_filtered.csv")
colSums(final.bc.matrix > 0)
##  JCW26_M1_M  JCW26_M1_E  JCW26_M2_M  JCW26_M2_E  JCW26_M3_M  JCW26_M3_E 
##          13          12          34          42          46          35 
##  JCW26_M4_M  JCW26_M4_E  ET27a_M1_M  ET27a_M1_E  ET27b_M1_M  ET27b_M1_E 
##          28          30          97         122          75          81 
## ET21_M454_M 
##          82
qplot(asinh(vara), asinh(varb), data=dab.filtered) + 
  facet_wrap(~paste( mouse,expt, celltype, sep = " _ "))

qplot(asinh(vara), asinh(varb), data=dab.cellnorm.filtered) + 
  facet_wrap(~paste( mouse,expt, celltype, sep = " _ "))

11. filtering based on either technical replicates or proportional abundance

filter barcodes which only occur in one of the two technical replicates or if they are only in one sample then the proportional abundance of a barcode in a sample must be above a threshold value

dab.filtered2 <- dab[(dab$vara>0 & dab$varb>0 | dab$vara > threshold| dab$varb > threshold) ,]
dab.cellnorm.filtered2 <- dab.cellnorm[(dab$vara>0 & dab$varb>0 | dab$vara > threshold| dab$varb > threshold),]
final.bcs2 <- unique(dab.filtered2$tag)
final.bc.matrix2 <- mergedReplicates[final.bcs2,]
final.bc.matrix.cellnorm2 <- mergedReplicates.cell.norm[final.bcs2,]
colSums(final.bc.matrix2 > 0)
##  JCW26_M1_M  JCW26_M1_E  JCW26_M2_M  JCW26_M2_E  JCW26_M3_M  JCW26_M3_E 
##          18          15          44          49          62          42 
##  JCW26_M4_M  JCW26_M4_E  ET27a_M1_M  ET27a_M1_E  ET27b_M1_M  ET27b_M1_E 
##          35          47         128         154         107         186 
## ET21_M454_M 
##         106
write.csv(final.bc.matrix2, "DRAG_DNA_barcodes_normalised_and_ab_filtered2.csv")
write.csv(final.bc.matrix.cellnorm2, "DRAG_DNA_barcodes_normalised_and_ab_filtered2_cellnorm.csv")

qplot(asinh(vara), asinh(varb), data=dab.filtered2) + facet_wrap(~paste( mouse,expt, celltype, sep = " _ "))

qplot(asinh(vara), asinh(varb), data=dab.cellnorm.filtered2) + facet_wrap(~paste( mouse,expt, celltype, sep = " _ "))

12. QC Plots: retention of RNA-DNA matche barcodes

let’s assess how well our filtering retains paired barcodes for which we have a higher confidence that they are a real barcode

bc.pairs <- read.csv("/Users/jasoncosgrove/Dropbox (Team_Perie)/Jason/Experiments/Dry_Lab/MetaFate_final_pipeline/data/Step6_10X_Data_integration/DNA_RNA_BC_PAIRS/all_dna_rna_bc_pairs.csv")

bc.pairs$expt <- vapply(strsplit(bc.pairs$expt_label,"_"), `[`, 1, FUN.VALUE=character(1))
bc.pairs$mouse <- vapply(strsplit(bc.pairs$expt_label,"_"), `[`, 2, FUN.VALUE=character(1))
expts <- unique(dab$expt)

for(i in 1:length(expts)){
  expt <- expts[i]
  
  dab.filtered <- dab.cellnorm.filtered2[dab.cellnorm.filtered2$expt == expt,]
  bc.pairs.filtered <- bc.pairs[bc.pairs$expt == expt,]
  pairs <- intersect(dab.filtered$tag,bc.pairs$all.bc.pairs)
  
  dab.filtered$paired <- "FALSE"
  dab.filtered[dab.filtered$tag %in% pairs,]$paired <- "TRUE"
  
  #fp <- paste("/Users/jasoncosgrove/Desktop/",expt,threshold,".tiff",sep = "")
  #tiff(fp, width = 6, height = 6,res = 200,units = "in")
  p <- qplot(asinh(vara), asinh(varb), data=dab.filtered,colour = paired) + facet_wrap(~paste(celltype,mouse, sep = " _ "))
  plot(p)
  #dev.off()

}